A social network of a karate club was studied by Wayne W. Zachary [1] for a period of three years from 1970 to 1972. The network captures 34 members of a karate club, documenting 78 pairwise links between members who interacted outside the club. During the study a conflict arose between the administrator "John A" and the instructor "Mr. Hi" (pseudonyms), which led to the club splitting in two. Half of the members formed a new club around Mr. Hi; the members of the other half found a new instructor or gave up karate. Based on the collected data, Zachary correctly assigned all but one member of the club to the groups they actually joined after the split.
[1] W. Zachary, An information flow model for conflict and fission in small groups, Journal of Anthropological Research 33, 452-473 (1977)
In [1]:
import swat
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.cm as cmx
# Also import networkx used for rendering a network
import networkx as nx
%matplotlib inline
In [2]:
# Open the CAS session used by every later cell (bound to the global `s`).
# Replace the placeholder host/port with your own CAS server.
s = swat.CAS('http://cas.mycompany.com:8888') # REST API
In [3]:
# Load the hypergroup action set so that hyperGroup() can be called on tables below.
s.loadactionset('hypergroup')
Out[3]:
The data set is taken from https://en.wikipedia.org/wiki/Zachary%27s_karate_club.
In [4]:
# Zachary karate club edge list: one [FROM, TO] pair per recorded link.
df = pd.DataFrame(
    data=[
        [2, 1], [3, 1], [3, 2], [4, 1], [4, 2], [4, 3], [5, 1], [6, 1],
        [7, 1], [7, 5], [7, 6], [8, 1], [8, 2], [8, 3], [8, 4], [9, 1],
        [9, 3], [10, 3], [11, 1], [11, 5], [11, 6], [12, 1], [13, 1],
        [13, 4], [14, 1], [14, 2], [14, 3], [14, 4], [17, 6], [17, 7],
        [18, 1], [18, 2], [20, 1], [20, 2], [22, 1], [22, 2], [26, 24],
        [26, 25], [28, 3], [28, 24], [28, 25], [29, 3], [30, 24], [30, 27],
        [31, 2], [31, 9], [32, 1], [32, 25], [32, 26], [32, 29], [33, 3],
        [33, 9], [33, 15], [33, 16], [33, 19], [33, 21], [33, 23], [33, 24],
        [33, 30], [33, 31], [33, 32], [34, 9], [34, 10], [34, 14], [34, 15],
        [34, 16], [34, 19], [34, 20], [34, 21], [34, 23], [34, 24], [34, 27],
        [34, 28], [34, 29], [34, 30], [34, 31], [34, 32], [34, 33],
    ],
    columns=['FROM', 'TO'],
)
# Keep string copies of both endpoints next to the numeric ids; the string
# columns are the ones handed to hyperGroup later on.
df = df.assign(
    SOURCE=df['FROM'].astype(str),
    TARGET=df['TO'].astype(str),
)
df.head()
Out[4]:
Hypergroup doesn't support numeric source and target columns - so make sure to cast them as varchars.
In [5]:
# Start clean: drop a KARATE table left over from an earlier run, if any.
if s.tableexists('karate').exists:
    s.CASTable('KARATE').droptable()

# Upload the DataFrame as CSV. The `vars` list declares one type per column
# (FROM, TO as doubles; SOURCE, TARGET as varchars), and the resulting table
# is promoted under the name KARATE.
dataset = s.upload(
    df,
    importoptions={
        'filetype': 'csv',
        'vars': [
            {'type': 'double'},
            {'type': 'double'},
            {'type': 'varchar'},
            {'type': 'varchar'},
        ],
    },
    casout={'name': 'KARATE', 'promote': True},
).casTable
In [6]:
# Preview the first five rows of the uploaded table.
dataset.head(5)
Out[6]:
In [7]:
# Summary statistics for the uploaded table.
dataset.summary()
Out[7]:
In [8]:
def renderNetworkGraph(filterCommunity=-1, size=18, sizeVar='_HypGrp_',
                       colorVar='', sizeMultipler=500, nodes_table='nodes',
                       edges_table='edges',
                       title='Zachary Karate Club social network'):
    """Draw the hypergroup vertices/edges tables as a network diagram.

    The CAS tables named by ``nodes_table`` and ``edges_table`` are read
    through the global session ``s`` and pulled to the client. Every vertex
    is placed at its ``_AllXCoord_`` / ``_AllYCoord_`` position and the graph
    is rendered with networkx on a newly created matplotlib figure.

    Parameters
    ----------
    filterCommunity : number
        When >= 0, only vertices whose ``_Community_`` equals this value and
        edges whose ``_SCommunity_`` and ``_TCommunity_`` both equal it are
        drawn. A negative value (the default) disables filtering.
    size : number
        Width and height of the square figure (``figsize=(size, size)``).
    sizeVar : str
        Vertices column that drives the node size. Values below 0.1 are
        raised to 0.1 before scaling.
    colorVar : str
        Vertices column used to colour the nodes and to build the legend.
        An empty string (the default) disables colouring and the legend.
    sizeMultipler : number
        Factor applied to the ``sizeVar`` value. The parameter name keeps
        its original spelling so existing callers continue to work.
    nodes_table, edges_table : str
        Names of the CAS tables holding vertices and edges.
    title : str
        Title shown above the plot.

    Raises
    ------
    KeyError
        If an edge references a vertex id that is missing from the vertices
        table (no position/size is known for it).
    """
    # --- Vertices: fetch, optionally filter, bring to the client ---------
    nodes = s.CASTable(nodes_table)
    if filterCommunity >= 0:
        nodes = nodes.query('_Community_ EQ %F' % filterCommunity)
    nodes = nodes.to_frame()

    # Iterate the columns in lockstep (positionally) rather than looking
    # rows up by a manually maintained counter, so the result does not
    # depend on the frame having a 0..n-1 index.
    nodePos = {}
    nodeSize = {}
    for nodeId, x, y, sizeValue in zip(nodes['_Value_'],
                                       nodes['_AllXCoord_'],
                                       nodes['_AllYCoord_'],
                                       nodes[sizeVar]):
        nodePos[nodeId] = (x, y)
        nodeSize[nodeId] = max(sizeValue, 0.1) * sizeMultipler

    nodeColor = {}
    if colorVar:
        nodeColor = dict(zip(nodes['_Value_'], nodes[colorVar]))
    # Distinct colour values, ascending; empty when colouring is disabled.
    communities = sorted(set(nodeColor.values()))

    # --- Edges: fetch, optionally filter, build (source, target) tuples --
    edges = s.CASTable(edges_table)
    if filterCommunity >= 0:
        edges = edges.query('_SCommunity_ EQ %F AND _TCommunity_ EQ %F' %
                            (filterCommunity, filterCommunity))
    edges = edges.to_frame()
    edgeTuples = list(zip(edges['_Source_'], edges['_Target_']))

    # --- Figure/axes: keep explicit handles instead of relying on figure
    # number 1 being the one that was just created.
    figure = plt.figure(figsize=(size, size))
    ax = figure.add_subplot(1, 1, 1)

    # --- Graph: add the vertices first so that vertices without any
    # (remaining) edge are still drawn.
    graph = nx.DiGraph()
    graph.add_nodes_from(nodePos)
    graph.add_edges_from(edgeTuples)

    # Size mapping, in graph iteration order
    getNodeSize = [nodeSize[v] for v in graph]

    # Color mapping
    jet = plt.get_cmap('jet')
    getNodeColor = None
    vmin = vmax = None
    if communities:
        getNodeColor = [nodeColor[v] for v in graph]
        # Use one shared value range for both the node colours and the
        # legend swatches so they always agree.
        vmin, vmax = communities[0], communities[-1]
        scalarMap = cmx.ScalarMappable(
            norm=colors.Normalize(vmin=vmin, vmax=vmax), cmap=jet)
        # networkx does not produce a labelled legend, so add one empty
        # proxy line per community. Empty data keeps the proxies from
        # influencing the axis limits.
        for community in communities:
            ax.plot([], [], color=scalarMap.to_rgba(community),
                    label='Community %s' % '{:2.0f}'.format(community),
                    linewidth=10)

    # --- Render the graph -------------------------------------------------
    nx.draw_networkx_nodes(graph, nodePos, node_size=getNodeSize,
                           node_color=getNodeColor, cmap=jet,
                           vmin=vmin, vmax=vmax, ax=ax)
    nx.draw_networkx_edges(graph, nodePos, width=1, alpha=0.5, ax=ax)
    nx.draw_networkx_labels(graph, nodePos, font_size=11,
                            font_family='sans-serif', ax=ax)

    if communities:
        ax.legend(loc='upper left', prop={'size': 11})
    ax.set_title(title, fontsize=30)
    ax.axis('off')
    plt.show()
In [9]:
# Handles for the two output tables; replace=True lets reruns overwrite them.
edges = s.CASTable('edges', replace=True)
nodes = s.CASTable('nodes', replace=True)

# First pass: run hyperGroup on the string endpoint columns and write the
# vertices and edges output tables.
dataset[['SOURCE', 'TARGET']].hyperGroup(
    createOut='never',
    allGraphs=True,
    edges=edges,
    vertices=nodes,
)
Out[9]:
In [10]:
# Draw the whole network on a 10x10 figure; nodes are sized by the default
# sizeVar ('_HypGrp_') scaled by 2000, with no colouring or legend.
renderNetworkGraph(size=10, sizeMultipler=2000)
Note: Network of the Zachary Karate Club. Distribution by degree of the node. Node 1 stands for the instructor ("Mr. Hi"), node 34 for the club administrator/president ("John A").
In [11]:
# Second pass: same call as before, now with community detection enabled.
# The edges and vertices output tables are overwritten.
dataset[['SOURCE', 'TARGET']].hyperGroup(
    createOut='never',
    allGraphs=True,
    community=True,
    edges=edges,
    vertices=nodes,
)
Out[11]:
How many hypergroups and communities do we have?
In [12]:
# Run distinct on the vertices table; the entries for _HypGrp_ and
# _Community_ are the ones of interest here.
nodes.distinct()
Out[12]:
In [13]:
# Summary statistics for the vertices table.
nodes.summary()
Out[13]:
What are the 4 biggest communities?
In [14]:
# Output table for the topK action (overwritten on reruns).
topKOut = s.CASTable('topKOut', replace=True)

# Rank the _Community_ values of the vertices table by row count ('N').
nodes[['_Community_']].topk(aggregator='N', topK=4, casOut=topKOut)

# From here on `topKOut` is the rank-ordered result (at most 10 rows)
# rather than the CAS table handle.
topKOut = topKOut.sort_values('_Rank_').head(10)
topKOut.columns
Out[14]:
In [15]:
# Bar chart of the vertex count (_Score_) per community in topKOut.
nCommunities = len(topKOut)
ind = np.arange(nCommunities)  # the x locations for the groups
plt.figure(figsize=(8, 4))
p1 = plt.bar(ind + 0.2, topKOut._Score_, 0.5, color='orange', alpha=0.75)
plt.ylabel('Vertices', fontsize=12)
plt.xlabel('Community', fontsize=12)
# Plain integer formatting: the previous '{:2.0f}' padded single-digit counts
# to width 2, leaving a double space in the title ("top  4 communities").
plt.title('Number of nodes for the top %d communities' % nCommunities)
# Label each bar with the formatted community value.
plt.xticks(ind + 0.2, topKOut._Fmtvar_)
plt.show()
Note: This shows that the biggest communities have up to 18 vertices.
What nodes belong to community 1?
In [16]:
# First five vertices whose _Community_ value equals 1.
nodes.query('_Community_ EQ 1').head(5)
Out[16]:
What edges do we have?
In [17]:
# First five rows of the edges output table written by hyperGroup.
edges.head(5)
Out[17]:
In [18]:
# Same plot as before, now coloured by the _Community_ column; passing
# colorVar also adds the per-community legend.
renderNetworkGraph(size=10, colorVar='_Community_', sizeMultipler=2000)
How important is a user in the network?
In [19]:
# Third pass: communities plus centrality measures and graph partitioning.
# The edges and vertices output tables are overwritten again.
dataset[['SOURCE', 'TARGET']].hyperGroup(
    createOut='never',
    community=True,
    centrality=True,
    mergeCommSmallest=True,
    allGraphs=True,
    graphPartition=True,
    # Returns centrality values closer to 1 in the center
    scaleCentralities='central1',
    edges=edges,
    vertices=nodes,
)
Out[19]:
In [20]:
# Peek at the vertices table after the centrality run; it is expected to
# carry the centrality columns now (e.g. _Betweenness_, used for the next plot).
nodes.head()
Out[20]:
Betweenness centrality quantifies the number of times a node acts as a bridge along the shortest path(s) between two other nodes. As such, it describes the importance of a node in a network.
In [21]:
# Colour nodes by community and size them by their _Betweenness_ value
# (default multiplier of 500).
renderNetworkGraph(size=10, colorVar='_Community_', sizeVar='_Betweenness_')
Render only community 1.
In [22]:
# The first positional argument is filterCommunity: only vertices and edges
# of community 1 are drawn. Nodes are sized by _CentroidAngle_ times 5.
renderNetworkGraph(1, size=10, sizeVar='_CentroidAngle_', sizeMultipler=5)
In [23]:
# Close the CAS session opened at the top of the notebook.
s.close()
Falko Schulz ▪ Principal Software Developer ▪ Business Intelligence Visualization R&D ▪ SAS® Institute ▪ falko.schulz@sas.com ▪ http://www.sas.com
In [ ]: